root@nvidia-desktop:/home/nvidia/tensorrt/bin# /usr/local/cuda/bin/nvprof --profile-from-start off  --print-gpu-trace  ./trtexec --deploy=ResNet-50-deploy.prototxt --output=prob --int8 --batch=2  --avgRuns=1 --iterations=1
&&&& RUNNING TensorRT.trtexec # ./trtexec --deploy=ResNet-50-deploy.prototxt --output=prob --int8 --batch=2 --avgRuns=1 --iterations=1
[I] deploy: ResNet-50-deploy.prototxt
[I] output: prob
[I] int8
[I] batch: 2
[I] avgRuns: 1
[I] iterations: 1
==9136== NVPROF is profiling process 9136, command: ./trtexec --deploy=ResNet-50-deploy.prototxt --output=prob --int8 --batch=2 --avgRuns=1 --iterations=1
[I] Input "data": 3x224x224
[I] Output "prob": 1000x1x1
==9136== Warning: Unified Memory Profiling is not supported on the underlying platform. System requirements for unified memory can be found at: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
[I] Average over 1 runs is 1977.77 ms (host walltime is 1989.45 ms, 99% percentile time is 1977.77).
&&&& PASSED TensorRT.trtexec # ./trtexec --deploy=ResNet-50-deploy.prototxt --output=prob --int8 --batch=2 --avgRuns=1 --iterations=1
==9136== Profiling application: ./trtexec --deploy=ResNet-50-deploy.prototxt --output=prob --int8 --batch=2 --avgRuns=1 --iterations=1
==9136== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*           Device   Context    Stream  Name
147.216s  38.338us           (2352 1 1)       (128 1 1)        22        0B        0B       Xavier (0)         1        86  void genericReformat::copyPackedKernel<float, char, bool=1, bool=0, genericReformat::IdentityCoordMapper<int=4>, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayNWithReducedDivisors<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) [336420]
147.216s  86.083us              (8 8 2)       (256 1 1)       123        0B  16.000KB       Xavier (0)         1        86  void first_layer_imma_fwd_kernel<int=8, int=7, int=7, int=64>(FirstLayerIMMAFwdParams) [336421]
147.216s  38.114us            (392 2 1)       (128 1 1)        31        0B        0B       Xavier (0)         1        86  void nvinfer1::poolNCxHWxInt8<nvinfer1::PoolingType>(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8<nvinfer1::PoolingType>*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) [336423]
147.216s  34.274us             (25 1 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336426]
147.216s  59.650us             (25 1 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336428]
147.216s  59.426us             (49 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336430]
147.216s  75.459us             (49 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 [336432]
147.216s  49.474us             (25 1 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336434]
147.216s  58.018us             (25 1 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336436]
147.216s  77.603us             (49 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336438]
147.217s  50.050us             (25 1 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_medium_nt_v1 [336440]
147.217s  56.387us             (25 1 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336442]
147.217s  77.891us             (49 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 [336444]
147.217s  26.049us             (13 1 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336446]
147.217s  40.546us              (7 2 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336448]
147.217s  34.561us             (13 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336450]
147.217s  60.163us             (13 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336452]
147.217s  21.537us             (13 1 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336454]
147.217s  35.969us             (13 1 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336456]
147.217s  42.274us             (13 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336458]
147.217s  22.273us             (13 1 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336460]
147.217s  48.386us              (7 1 1)       (256 1 1)       254  48.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x128_ldg16_relu_small_nt_v1 [336462]
147.217s  45.666us             (13 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336464]
147.217s  31.073us             (13 1 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 [336466]
147.217s  43.970us              (7 1 1)       (256 1 1)       254  48.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x128_ldg16_relu_small_nt_v1 [336468]
147.217s  58.882us             (13 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_medium_nt_v1 [336470]
147.218s  29.345us              (4 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336472]
147.218s  49.730us              (4 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336474]
147.218s  41.474us             (2 16 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336476]
147.218s  59.586us              (4 8 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_medium_nt_v1 [336478]
147.218s  38.946us              (4 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336480]
147.218s  62.083us              (2 4 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336482]
147.218s  37.217us              (4 8 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336484]
147.218s  28.193us              (4 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336486]
147.218s  48.737us              (2 4 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336488]
147.218s  48.130us             (2 16 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336490]
147.218s  30.529us              (2 4 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336492]
147.218s  50.818us              (2 4 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336494]
147.218s  34.273us             (2 16 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336496]
147.218s  39.521us              (4 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336498]
147.218s  58.274us              (2 4 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336500]
147.219s  44.610us             (2 16 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_interior_nt_v1 [336502]
147.219s  37.218us              (4 2 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336504]
147.219s  59.971us              (2 4 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336506]
147.219s  36.577us             (2 16 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336508]
147.219s  27.041us              (1 8 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336510]
147.219s  100.26us              (1 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336512]
147.219s  34.305us             (1 16 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_interior_nt_v1 [336514]
147.219s  41.986us             (1 16 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336516]
147.219s  61.826us              (1 8 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336518]
147.219s  92.356us              (1 8 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336520]
147.219s  25.057us             (1 16 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336522]
147.219s  49.442us              (1 4 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336524]
147.219s  98.084us              (1 8 1)       (128 1 1)       254  20.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_256x64_ldg16_relu_singleBuffer_small_nt_v1 [336526]
147.219s  25.025us             (1 16 1)       (128 1 1)       254  32.000KB        0B       Xavier (0)         1        86  trt_volta_int8_i8816cudnn_int8_128x128_ldg16_relu_small_nt_v1 [336528]
147.219s  8.6090us              (4 2 1)       (128 1 1)        31        0B        0B       Xavier (0)         1        86  void nvinfer1::poolNCxHWxInt8<nvinfer1::PoolingType>(nvinfer1::IMMAInt8PackedArray const *, nvinfer1::poolNCxHWxInt8<nvinfer1::PoolingType>*, int, int, nvinfer1::rt::reduced_divisor, int, int, int, nvinfer1::rt, int, int, int, int, int, int, nvinfer1::rt, float, float, nvinfer1::IMMAFloatPackedArray const *, nvinfer1::IMMAFloatPackedArray const , int, int) [336530]
147.220s  5.6320us             (32 1 1)       (128 1 1)        22        0B        0B       Xavier (0)         1        86  void genericReformat::copyPackedKernel<char, float, bool=1, bool=0, genericReformat::IdentityCoordMapper<int=4>, int=4>(unsigned int, unsigned int, void const *, genericReformat::ArrayN<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayNWithReducedDivisors<genericReformat::IdentityCoordMapper<int=4>>, genericReformat::ArrayN, int, int, int, float const *, void*, genericReformat::ArrayN, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayNWithReducedDivisors, genericReformat::ArrayN, int, int, int, float const , int=4) [336533]
147.220s  88.387us             (16 1 1)       (256 1 1)        56  6.0000KB        0B       Xavier (0)         1        86  void gemmSN_NN_kernel<float, int=256, int=4, int=2, int=8, int=2, int=4, cublasGemvTensorStridedBatched<float const >, cublasGemvTensorStridedBatched<float>>(cublasGemmSmallNParams<float const , cublasGemvTensorStridedBatched<float const >, float>) [336534]
147.220s  4.4480us             (64 1 1)       (256 1 1)        17        0B        0B       Xavier (0)         1        86  void op_generic_tensor_kernel<int=2, float, float, float, int=256, cudnnGenericOp_t=0, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, cudnnTensorStruct, float const *, float, float, float, float, dimArray, reducedDivisorArray, bool) [336543]
147.220s  6.8800us              (1 1 2)       (256 1 1)        23  1.0039KB        0B       Xavier (0)         1        86  void cudnn::detail::softmax_fw_kernel<int=2, float, float, int=256, int=1, int=1, int=0>(cudnnTensorStruct, float const *, cudnn::detail::softmax_fw_kernel<int=2, float, float, int=256, int=1, int=1, int=0>, cudnnTensorStruct*, int, float, cudnnTensorStruct*, int, int) [336545]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
